Load and Clean the Data

data = read.csv("day.csv") # daily bike-sharing records (731 rows; presumably the UCI Bike Sharing dataset's day.csv -- TODO confirm)
str(data) # inspect column types before cleaning
## 'data.frame':    731 obs. of  16 variables:
##  $ instant   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ dteday    : Factor w/ 731 levels "2011-01-01","2011-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ yr        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mnth      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday   : int  6 0 1 2 3 4 5 6 0 1 ...
##  $ workingday: int  0 0 1 1 1 1 1 0 0 1 ...
##  $ weathersit: int  2 2 1 1 1 1 2 2 1 1 ...
##  $ temp      : num  0.344 0.363 0.196 0.2 0.227 ...
##  $ atemp     : num  0.364 0.354 0.189 0.212 0.229 ...
##  $ hum       : num  0.806 0.696 0.437 0.59 0.437 ...
##  $ windspeed : num  0.16 0.249 0.248 0.16 0.187 ...
##  $ casual    : int  331 131 120 108 82 88 148 68 54 41 ...
##  $ registered: int  654 670 1229 1454 1518 1518 1362 891 768 1280 ...
##  $ cnt       : int  985 801 1349 1562 1600 1606 1510 959 822 1321 ...

All of the features are numeric; we will convert the categorical ones to factors and assign meaningful level labels:

# Convert the integer-coded categorical columns to factors and attach
# human-readable labels. Assigning to levels() relabels the existing
# (sorted) levels in place, so only levels that actually occur get a label.
to_labeled_factor <- function(x, labs) {
  f <- as.factor(x)
  levels(f) <- labs
  f
}

data$season     <- to_labeled_factor(data$season, c("spring", "summer", "fall", "winter"))
data$holiday    <- to_labeled_factor(data$holiday, c("no", "yes"))
data$workingday <- to_labeled_factor(data$workingday, c("no", "yes"))
# only three of the four documented weather categories occur in the data
data$weathersit <- to_labeled_factor(data$weathersit, c("Clearish", "Misty", "LightPrecip"))
data$weekday    <- to_labeled_factor(data$weekday, c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))
data$mnth       <- to_labeled_factor(data$mnth, c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
data$yr         <- to_labeled_factor(data$yr, c("2011", "2012"))

str(data)
## 'data.frame':    731 obs. of  16 variables:
##  $ instant   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ dteday    : Factor w/ 731 levels "2011-01-01","2011-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ season    : Factor w/ 4 levels "spring","summer",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ yr        : Factor w/ 2 levels "2011","2012": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mnth      : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday   : Factor w/ 7 levels "Sun","Mon","Tue",..: 7 1 2 3 4 5 6 7 1 2 ...
##  $ workingday: Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 2 1 1 2 ...
##  $ weathersit: Factor w/ 3 levels "Clearish","Misty",..: 2 2 1 1 1 1 2 2 1 1 ...
##  $ temp      : num  0.344 0.363 0.196 0.2 0.227 ...
##  $ atemp     : num  0.364 0.354 0.189 0.212 0.229 ...
##  $ hum       : num  0.806 0.696 0.437 0.59 0.437 ...
##  $ windspeed : num  0.16 0.249 0.248 0.16 0.187 ...
##  $ casual    : int  331 131 120 108 82 88 148 68 54 41 ...
##  $ registered: int  654 670 1229 1454 1518 1518 1362 891 768 1280 ...
##  $ cnt       : int  985 801 1349 1562 1600 1606 1510 959 822 1321 ...

We note that although the dataset description specifies four levels for the weather variable, only three of those levels actually occur in the data.


Check for Missing Data

sum(is.na(data)) # total number of NA cells across the whole data frame
## [1] 0

There does not appear to be any missing data.


EDA

Distribution of Target

# Five-number summary and histogram of the target (total daily ridership)
summary(data$cnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      22    3152    4548    4504    5956    8714
hist(data$cnt, col = "lightslateblue", main = "Histogram of Total Ridership Count", xlab = "Total Ridership Count")

It seems roughly normally distributed, although since it is a count it should follow a Poisson distribution rather than a normal one.

Pairs plots between numeric features

pairs(data[,10:16]) # scatterplot matrix of the numeric columns (temp, atemp, hum, windspeed, casual, registered, cnt)

It looks like there are relationships between temp and cnt, although not necessarily linear. The other relationships are not so clear.

Correlation Between Numeric Features

cor(data[,10:16]) # pairwise Pearson correlations among the numeric columns
##                  temp      atemp         hum  windspeed      casual
## temp        1.0000000  0.9917016  0.12696294 -0.1579441  0.54328466
## atemp       0.9917016  1.0000000  0.13998806 -0.1836430  0.54386369
## hum         0.1269629  0.1399881  1.00000000 -0.2484891 -0.07700788
## windspeed  -0.1579441 -0.1836430 -0.24848910  1.0000000 -0.16761335
## casual      0.5432847  0.5438637 -0.07700788 -0.1676133  1.00000000
## registered  0.5400120  0.5441918 -0.09108860 -0.2174490  0.39528245
## cnt         0.6274940  0.6310657 -0.10065856 -0.2345450  0.67280443
##            registered        cnt
## temp        0.5400120  0.6274940
## atemp       0.5441918  0.6310657
## hum        -0.0910886 -0.1006586
## windspeed  -0.2174490 -0.2345450
## casual      0.3952825  0.6728044
## registered  1.0000000  0.9455169
## cnt         0.9455169  1.0000000

Distributions of Numeric Features

# Summary confirms the numeric weather features are normalized to roughly [0, 1]
summary(data[,10:13])
##       temp             atemp              hum           windspeed      
##  Min.   :0.05913   Min.   :0.07907   Min.   :0.0000   Min.   :0.02239  
##  1st Qu.:0.33708   1st Qu.:0.33784   1st Qu.:0.5200   1st Qu.:0.13495  
##  Median :0.49833   Median :0.48673   Median :0.6267   Median :0.18097  
##  Mean   :0.49538   Mean   :0.47435   Mean   :0.6279   Mean   :0.19049  
##  3rd Qu.:0.65542   3rd Qu.:0.60860   3rd Qu.:0.7302   3rd Qu.:0.23321  
##  Max.   :0.86167   Max.   :0.84090   Max.   :0.9725   Max.   :0.50746
par(mfrow=c(2,2)) # 2x2 grid: one histogram per numeric weather feature
# Fix: temp is normalized (max 0.86 in the summary above), so the previous
# "Temperature (Celsius)" x-axis label was misleading.
hist(data[,10], main="Temp", xlab = "Normalized Temperature", col = "tomato")

hist(data[,11], main="ATemp", xlab = "Normalized Temperature", col = "tomato2")

hist(data[,12], main="Humidity", xlab = "Humidity", col = "turquoise1")

hist(data[,13], main="Windspeed", xlab = "Windspeed", col = "skyblue")

These do not appear to be normally distributed, although Windspeed could possibly be made approximately normal with a transformation, and Humidity is fairly close to normal.

Distributions of Categorical Features

# One barplot per categorical feature. prop.table() rescales counts to
# shares, so the y-axis shows proportions (the previous "Count" label was
# wrong). Color vectors are trimmed to the number of bars; this renders
# identically because barplot() only uses as many colors as there are bars.
par(mfrow=c(1,3))
barplot(prop.table(table(data$season)), col = 1:4, main = "Distribution of Season", xlab = "Season", ylab = "Proportion")

barplot(prop.table(table(data$mnth)), col = 1:12,  main = "Distribution of Months", xlab = "Month", ylab = "Proportion")

barplot(prop.table(table(data$weekday)), col = 1:7,  main = "Distribution of Weekdays", xlab = "Weekday", ylab = "Proportion")

barplot(prop.table(table(data$holiday)), col = 6:7,  main = "Distribution of Holidays", xlab = "Holiday", ylab = "Proportion")

barplot(prop.table(table(data$workingday)), col = 8:9,  main = "Distribution of Working Days", xlab = "Working Day", ylab = "Proportion")

barplot(prop.table(table(data$weathersit)), col = 10:12,  main = "Distribution of Weather Types", xlab = "Weather Type", ylab = "Proportion")

It appears that Months, Seasons and Weekdays are distributed uniformly. However, there are very few holidays, more working days than non-working days and the weather seems to be mostly clear.

More Plots

# create a color palette
# NOTE(review): brewer.pal() comes from RColorBrewer, but no
# library(RColorBrewer) call is visible in this file -- confirm it is
# loaded in an earlier setup chunk.
palette(brewer.pal(n = 12, name = "Set3"))

# Total ridership summed within each level of month, season and weather
par(mfrow=c(1,3))
barplot(aggregate(x = data$cnt, by = list(data$mnth), FUN = sum)$x, names = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), main = "Total Ridership By Month", col = 1:12)

palette(brewer.pal(n = 4, name = "Accent"))
barplot(aggregate(x = data$cnt, by = list(data$season), FUN = sum)$x, names = c("spring", "summer", "fall", "winter"), main = "Total Ridership By Season", col = 1:4)

palette(brewer.pal(n = 4, name = "Set1"))
barplot(aggregate(x = data$cnt, by = list(data$weathersit), FUN = sum)$x, names = c("Clearish", "Misty", "LightPrecip"), main = "Total Ridership By Weather Type", col = 1:4)

There appear to be relationships between Month, Season and Weather and Total Ridership. However, the ridership by Weather closely resembles the overall distribution of Weather types, so this relationship may not be so important.

# Total ridership summed within each level of holiday, working day and weekday.
par(mfrow=c(1,3))
palette(brewer.pal(n = 4, name = "Set3"))
holiday_totals <- aggregate(x = data$cnt, by = list(data$holiday), FUN = sum)$x
barplot(holiday_totals, names = c("No", "Yes"), main = "Total Ridership By Holiday", col = 1:12)

palette(brewer.pal(n = 4, name = "Accent"))
workingday_totals <- aggregate(x = data$cnt, by = list(data$workingday), FUN = sum)$x
barplot(workingday_totals, names = c("No", "Yes"), main = "Total Ridership By Workingday", col = 1:4)

palette(brewer.pal(n = 7, name = "Set2"))
weekday_totals <- aggregate(x = data$cnt, by = list(data$weekday), FUN = sum)$x
barplot(weekday_totals, names = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"), main = "Total Ridership By Weekday", col = 1:7)

While there is a higher ridership on non-holidays and workingdays, these very closely resemble the distributions of Holidays and Workingdays in the data, so these may not be very useful as predictors.

par(mfrow=c(1,3)) # one scatterplot per numeric weather feature vs the target
palette(brewer.pal(n = 6, name = "Dark2"))
plot(data$temp, data$cnt, main = "Temp vs Total Ridership", xlab = "Temp", ylab = "Total Ridership", col = 1)

plot(data$hum, data$cnt, main = "Humidity vs Total Ridership", ylab = "Total Ridership", xlab = "Humidity", col = 2)

plot(data$windspeed, data$cnt, main = "Windspeed vs Total Ridership", ylab = "Total Ridership", xlab = "Windspeed", col = 3)

It seems that there is a somewhat linear relationship between temperature and total ridership, although it may be more polynomial. It is not clear what the relationships between Humidity and Windspeed and Ridership are.

par(mfrow=c(1,3)) # pairwise scatterplots among the numeric weather features
plot(data$windspeed, data$hum, main = "Windspeed vs Humidity", ylab = "Humidity", xlab = "Windspeed", col = 4)

plot(data$windspeed, data$temp, main = "Windspeed vs Temp", ylab = "Temp", xlab = "Windspeed", col = 5)

plot(data$hum, data$temp, main = "Humidity vs Temp", ylab = "Temp", xlab = "Humidity", col = 6)

It seems as if these numerical features may be related somehow, although they do not seem to be related linearly.

Finding Interactions

First we will look at boxplots of Total Ridership by the categorical features to see if anything stands out as worth exploring:

# Boxplots of the target against each categorical feature
palette(brewer.pal(n = 12, name = "Set3"))
par(mfrow = c(1,2))
boxplot(cnt ~ weathersit, data = data, col = 1:4, main = "Count by Weather", ylab = "Total Ridership")
boxplot(cnt ~ season, data = data, col = 5:8, main = "Count by Season", ylab = "Total Ridership")

boxplot(cnt ~ mnth, data = data, col = 1:12, main = "Count by Month", ylab = "Total Ridership")
boxplot(cnt ~ weekday, data = data, col = 1:7, main = "Count by Weekday", ylab = "Total Ridership")

boxplot(cnt ~ workingday, data = data, col = 8:10, main = "Count by Working Day", ylab = "Total Ridership")
boxplot(cnt ~ holiday, data = data, col = 10:12, main = "Count by Holiday", ylab = "Total Ridership")

It appears that Weather, Season and Month may be worth further exploration, Weekday and Working Day don’t seem to be of much interest, and while the mean Ridership on Holidays is lower than on non-Holidays, it is not significantly lower.

We will now look at some interactions between the categorical features we identified above and some numerical features.

# Points colored by season to look for season x temp / season x humidity interactions
par(mfrow=c(1,2))
palette(brewer.pal(n = 4, name = "Set1"))
plot(data$temp, data$cnt, col = data$season, pch = 20, main = "Ridership vs Temp by Season", xlab = "Temperature", ylab = "Ridership")
# note: there are 4 season levels, so legend's col = 1:5 is truncated to 4 colors
legend("topleft", legend = c("Spring", "Summer", "Fall", "Winter"), col = 1:5, lwd = 1, lty = c(0,0), pch = 20)

plot(data$hum, data$cnt, col = data$season, pch = 20, main = "Ridership vs Humidity by Season", xlab = "Humidity", ylab = "Ridership")
legend("topleft", legend = c("Spring", "Summer", "Fall", "Winter"), col = 1:5, lwd = 1, lty = c(0,0), pch = 20)

As we would expect the temperature to be correlated to the season the Ridership vs Temperature plot doesn’t show any interesting patterns. However, the Ridership vs Humidity plot has vertical clusters which could indicate interesting correlations.

# Points colored by weather type to look for weather x temp / weather x humidity interactions
par(mfrow=c(1,2))
palette(brewer.pal(n = 4, name = "Set2"))
plot(data$temp, data$cnt, col = data$weathersit, pch = 20, main = "Ridership vs Temp by Weather", xlab = "Temperature", ylab = "Ridership")
# note: only 3 weather levels occur, so legend's col = 1:5 is truncated to 3 colors
legend("topleft", legend = c("Clear", "Misty", "Light Precip"), col = 1:5, lwd = 1, lty = c(0,0), pch = 20)

plot(data$hum, data$cnt, col = data$weathersit, pch = 20, main = "Ridership vs Humidity by Weather", xlab = "Humidity", ylab = "Ridership")
legend("topleft", legend = c("Clear", "Misty", "Light Precip"), col = 1:5, lwd = 1, lty = c(0,0), pch = 20)

Similarly, as we would expect Humidity to be correlated with Weather, the Ridership vs Humidity plot doesn't show any interesting patterns, while the Ridership vs Temperature plot indicates that there may be some.

# Points colored by month to look for month x temp / month x humidity interactions
par(mfrow = c(1,2))
palette(brewer.pal(n = 12, name = "Set3"))
plot(data$temp, data$cnt, col = data$mnth, pch = 20, main = "Ridership vs Temp by Month", xlab = "Temp", ylab = "Ridership")

# Fix: this panel plots humidity on the x-axis, so label it "Humidity" (was "Temp")
plot(data$hum, data$cnt, col = data$mnth, pch = 20, main = "Ridership vs Humidity by Month", xlab = "Humidity", ylab = "Ridership")
legend("topleft", legend = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), col = 1:12, lwd = 1, lty = c(0,0), pch = 20)

It is difficult to draw conclusions from the plots above; however, both interactions may merit further investigation.

Let's remove the features that are not useful:

data=data[,c(-1,-2,-14,-15)] # drop instant, dteday, casual and registered -- row ids and target components, not needed as predictors
# Leave-one-out cross-validated RMSE for a fitted lm, using the shortcut
#   RMSE_loocv = sqrt(mean((e_i / (1 - h_ii))^2))
# where e_i are the residuals and h_ii the leverages, so no refitting is
# needed. Non-finite terms (observations with leverage h_ii == 1) are
# dropped before averaging.
calc_loocv_rmse <- function(model) {
  loo_resid_sq <- (resid(model) / (1 - hatvalues(model)))^2
  loo_resid_sq <- loo_resid_sq[is.finite(loo_resid_sq)]
  sqrt(mean(loo_resid_sq))
}


Multiple linear regression and Residual diagnostics


Let's separate the numerical and categorical variables

# Named logical vector, one entry per column: TRUE when the column is numeric.
# vapply() replaces unlist(lapply(...)) because it guarantees a logical result
# regardless of input shape.
numerical <- vapply(data, is.numeric, logical(1))


Let's first start by using only the numerical predictors to model the target variable

data_numerical= data[, numerical] # keep the target plus all numerical columns
bike_mod_num = lm(cnt ~ ., data = data_numerical) # additive model with all numerical predictors
summary(bike_mod_num)[["coefficients"]] # get the coefficients
##              Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)  3860.368   355.3890 10.8623754 1.426590e-25
## temp         2111.814  2282.1976  0.9253422 3.550954e-01
## atemp        5139.152  2576.9972  1.9942406 4.649933e-02
## hum         -3149.110   383.9943 -8.2009286 1.081541e-15
## windspeed   -4528.675   721.0854 -6.2803588 5.815379e-10
summary(bike_mod_num)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.460878
calc_loocv_rmse(bike_mod_num) # get the loocv rmse
## [1] 1456.515
par(mfrow = c(2, 2))
plot(bike_mod_num,col = 'dodgerblue') # residual diagnostic plots


Findings:


Dummy variables


Let's now use both the categorical and numerical predictors and see if we get some lift in the model's performance.

bike_mod_all=lm(cnt~., data=data) # additive model with all predictors (factor dummies + numerical)
summary(bike_mod_all)[["coefficients"]] # get the coefficients
##                          Estimate Std. Error     t value      Pr(>|t|)
## (Intercept)            1485.84392  239.74649  6.19756289  9.774832e-10
## seasonsummer            884.71081  179.49244  4.92895861  1.032402e-06
## seasonfall              832.70222  213.12943  3.90702600  1.024766e-04
## seasonwinter           1575.35064  181.00081  8.70355585  2.279871e-17
## yr2012                 2019.73543   58.22037 34.69121792 2.294966e-154
## mnthFeb                 131.02513  143.77618  0.91131318  3.624432e-01
## mnthMar                 542.82773  165.43291  3.28125608  1.084575e-03
## mnthApr                 451.16562  247.56686  1.82239911  6.881974e-02
## mnthMay                 735.50548  267.62770  2.74824120  6.145473e-03
## mnthJun                 515.40479  282.41068  1.82501876  6.842301e-02
## mnthJul                  30.79664  313.82098  0.09813442  9.218536e-01
## mnthAug                 444.94895  303.16514  1.46767848  1.426395e-01
## mnthSep                1004.17283  265.12279  3.78757646  1.651562e-04
## mnthOct                 519.67428  241.55359  2.15138294  3.178651e-02
## mnthNov                -116.69328  230.77620 -0.50565560  6.132572e-01
## mnthDec                 -89.59150  182.21449 -0.49168152  6.230982e-01
## holidayyes             -589.69719  180.36226 -3.26951542  1.129889e-03
## weekdayMon              212.05386  109.49409  1.93666948  5.318673e-02
## weekdayTue              309.52763  107.13399  2.88916364  3.981733e-03
## weekdayWed              381.35742  107.47774  3.54824567  4.136266e-04
## weekdayThu              386.34297  107.52858  3.59293290  3.498183e-04
## weekdayFri              436.98377  107.43933  4.06726072  5.296026e-05
## weekdaySat              440.45884  106.56233  4.13334457  4.007227e-05
## weathersitMisty        -462.53813   77.08692 -6.00021534  3.156422e-09
## weathersitLightPrecip -1965.08654  197.05173 -9.97244020  5.386219e-22
## temp                   2855.01072 1398.15623  2.04198262  4.152649e-02
## atemp                  1786.15738 1462.11957  1.22162196  2.222607e-01
## hum                   -1535.46844  292.44826 -5.25039352  2.013660e-07
## windspeed             -2823.29668  414.55208 -6.81047529  2.091887e-11
summary(bike_mod_all)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.8423198
calc_loocv_rmse(bike_mod_all) # get the loocv rmse
## [1] 794.7767
par(mfrow = c(2, 2))
plot(bike_mod_all,col = 'dodgerblue') # residual diagnostic plots


Findings:


Based upon the results it would be good to test the significance of a few of the categorical variables, which are listed below:

bike_mod_w_month=lm(cnt~.-mnth, data=data) # model without month
bike_mod_w_weekday=lm(cnt~.-weekday, data=data) # model without weekday
bike_mod_w_workingday=lm(cnt~.-workingday, data=data) # model without workingday
# NOTE(review): anova() with several models compares each against the next in
# sequence, but models 1-3 each drop a DIFFERENT variable and are not nested
# in one another, so only the comparison against the full model is a valid
# F-test -- consider pairwise anova(reduced, bike_mod_all) calls instead.
anova(bike_mod_w_month,bike_mod_w_weekday,bike_mod_w_workingday,bike_mod_all) # do anova test
## Analysis of Variance Table
## 
## Model 1: cnt ~ (season + yr + mnth + holiday + weekday + workingday + 
##     weathersit + temp + atemp + hum + windspeed) - mnth
## Model 2: cnt ~ (season + yr + mnth + holiday + weekday + workingday + 
##     weathersit + temp + atemp + hum + windspeed) - weekday
## Model 3: cnt ~ (season + yr + mnth + holiday + weekday + workingday + 
##     weathersit + temp + atemp + hum + windspeed) - workingday
## Model 4: cnt ~ season + yr + mnth + holiday + weekday + workingday + weathersit + 
##     temp + atemp + hum + windspeed
##   Res.Df       RSS Df Sum of Sq       F    Pr(>F)    
## 1    713 472452877                                   
## 2    707 428442679  6  44010198 12.3957 2.683e-13 ***
## 3    702 415401688  5  13040991  4.4077 0.0005852 ***
## 4    702 415401688  0         0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Findings:


Let's fit the model again, without the workingday variable.

data_2= data[,c(-6)] # drop column 6 (workingday)
bike_mod_all_2=lm(cnt~., data=data_2) # refit with all remaining variables
summary(bike_mod_all_2)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.8423198
calc_loocv_rmse(bike_mod_all_2) # get the loocv rmse
## [1] 794.7767


This looks like a good starting point. We can now get started with identifying multicollinearity in the model; we will start by looking at the VIF (variance inflation factor).

library(faraway) # provides vif()
vif(bike_mod_all_2) # variance inflation factor for each dummy/numeric predictor
##          seasonsummer            seasonfall          seasonwinter 
##              7.496334             10.720030              7.455172 
##                yr2012               mnthFeb               mnthMar 
##              1.046828              1.835947              2.624298 
##               mnthApr               mnthMay               mnthJun 
##              5.704404              6.868018              7.423137 
##               mnthJul               mnthAug               mnthSep 
##              9.443506              8.813082              6.542133 
##               mnthOct               mnthNov               mnthDec 
##              5.594951              4.956867              3.183722 
##            holidayyes            weekdayMon            weekdayTue 
##              1.121296              1.821782              1.730242 
##            weekdayWed            weekdayThu            weekdayFri 
##              1.741363              1.743011              1.740119 
##            weekdaySat       weathersitMisty weathersitLightPrecip 
##              1.725530              1.642310              1.338411 
##                  temp                 atemp                   hum 
##             80.806689             70.036722              2.140362 
##             windspeed 
##              1.273296


As suspected before, temp and atemp have a high level of collinearity. Let's get the partial correlation coefficient for the temp variable and see if it is useful for the model.

temp_model=lm(temp~.-cnt, data=data_2) # regress temp on the other predictors
# NOTE(review): OLS residuals are orthogonal to every predictor included in
# the fitted model, and bike_mod_all_2 still contains temp, so this
# correlation is zero by construction (the printed 2.3e-16 is machine
# epsilon). A true partial correlation for temp should use residuals from a
# cnt model fitted WITHOUT temp, e.g.
# cor(resid(lm(cnt ~ . - temp, data = data_2)), resid(temp_model)).
cor(resid(bike_mod_all_2), resid(temp_model))
## [1] 2.286334e-16


Since the value is quite small, we can consider removing the temp variable from the model. Although multicollinearity might not have much impact on prediction, it does have an impact on inference from the model.

data_3= data[,c(-6, -8)] # drop workingday (col 6) and temp (col 8); atemp is retained
bike_mod_all_3=lm(cnt~., data=data_3)
summary(bike_mod_all_3)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.8416089
calc_loocv_rmse(bike_mod_all_3) # get the loocv rmse
## [1] 791.1807


Outlier diagnostics


Next we would also want to check for potential outliers; we have three ways of doing it:

We will be using Cook's distance to identify any such outliers and see their effect on the model.

First let's calculate the number of observations flagged by Cook's distance:

# Count observations whose Cook's distance exceeds the common 4/n heuristic threshold
sum(cooks.distance(bike_mod_all_3) > 4 / length(cooks.distance(bike_mod_all_3)))
## [1] 42


Next we can consider fitting a model after removing these observations

cokks_distance = cooks.distance(bike_mod_all_3) # NOTE(review): "cokks" is a typo for "cooks"; name kept as-is because later chunks reuse it
bike_mod_all_4 = lm(cnt ~.,
                    data = data_3,
                    subset = cokks_distance <= 4 / length(cokks_distance)) # refit on the non-influential observations only
summary(bike_mod_all_4)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.9093198
calc_loocv_rmse(bike_mod_all_4) # get the loocv rmse
## [1] 584.7121
par(mfrow = c(2, 2))
plot(bike_mod_all_4,col = 'dodgerblue') # residual diagnostic plots


Findings:

Interaction


Let's include the interactions and see if they help improve the model's performance.

# cnt ~ .^2 adds all pairwise interactions between the remaining predictors
bike_mod_all_5 = lm(cnt ~.^2,
                    data = data_3,
                    subset = cokks_distance <= 4 / length(cokks_distance))
summary(bike_mod_all_5)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.946074
calc_loocv_rmse(bike_mod_all_5) # get the loocv rmse
## [1] 610.4657
par(mfrow = c(2, 2))
plot(bike_mod_all_5,col = 'dodgerblue') # residual diagnostic plots

length(coef(bike_mod_all_5)) # number of parameters after adding interactions
## [1] 305


Findings:


Model selection


Next we will use a model selection criterion to rule out predictors that are not useful, since after the addition of interactions we have a lot of predictors.

bike_mod_all_6=step(bike_mod_all_5, trace=0, direction="backward") # backward selection starting from the interaction model (step() uses AIC by default)
summary(bike_mod_all_6)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.9475997
calc_loocv_rmse(bike_mod_all_6) # get the loocv rmse
## [1] 482.5356
par(mfrow = c(2, 2))
plot(bike_mod_all_6,col = 'dodgerblue') # residual diagnostic plots

length(coef(bike_mod_all_6)) # number of parameters kept after selection
## [1] 127


Findings:


Polynomial regression


Let's also check whether adding polynomial features turns out to be useful for the model

# Additive model plus squared terms for the numeric predictors, then backward-select
temp_mod = lm(cnt ~ .+I(atemp^2)+I(hum^2)+I(windspeed^2),
                    data = data_3,
                    subset = cokks_distance <= 4 / length(cokks_distance))
bike_mod_all_7=step(temp_mod, trace=0, direction="backward")
summary(bike_mod_all_7)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.9185862
calc_loocv_rmse(bike_mod_all_7) # get the loocv rmse
## [1] 554.8853
par(mfrow = c(2, 2))
plot(bike_mod_all_7,col = 'dodgerblue') # residual diagnostic plots


Findings:


Transformations


We will also want to check whether transformations of some variables help improve the model.

# Log-transform the response, refit the full interaction model, then backward-select
temp_m = lm(log(cnt) ~.^2,
                    data = data_3,
                    subset = cokks_distance <= 4 / length(cokks_distance))
bike_mod_all_8=step(temp_m, trace=0, direction="backward")
# NOTE(review): this adj. r-squared is on the log(cnt) scale, so it is not
# directly comparable to the earlier models fitted on cnt.
summary(bike_mod_all_8)[["adj.r.squared"]] # get the adjusted r-squared
## [1] 0.9376945
par(mfrow = c(2, 2))
plot(bike_mod_all_8,col = 'dodgerblue') # residual diagnostic plots

length(coef(bike_mod_all_8)) # get the number of params
## [1] 114
## [1] 114

Findings: